** Start from a clean session
* Drop data, stored results and programs from memory
clear all
* Close any log left open by an earlier run (no error if none is open)
capture log close
* Never pause output; the setting is saved for future sessions as well
set more off, permanently
* Fix the random-number seed
set seed 1234

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////// Table 5: Priority Subjects, Omitted Questions and Zeros (2001-2002) ////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

** Load the Phase 2 long-format dataset (normalized scores)
use "Work Data/Gender_Phase2_long.dta",clear

*** Creating variables
* sub: numeric, value-labelled copy of the string variable subject
encode subject, gen (sub)
* d_sub1, d_sub2, ...: one indicator per distinct value of subject,
* numbered in the sorted order of those values
tab subject, gen (d_sub)
label var sub "Subject"

** Subject dummies
* NOTE(review): the mapping below assumes subject has exactly eight values
* whose sorted order is Biology, Chemistry, Geography, History,
* Foreign Language, Mathematics, Physics, Portuguese - confirm with tab subject.
rename d_sub1 Biology
rename d_sub2 Chemistry
rename d_sub3 Geography
rename d_sub4 History
rename d_sub5 Language
rename d_sub6 Mathematics
rename d_sub7 Physics
rename d_sub8 Portuguese
* Labels
* Full variable names are used throughout: the abbreviation Math only resolves
* to Mathematics while variable-name abbreviation is switched on.
label var Biology "Biology"
label var Chemistry "Chemistry"
label var Geography "Geography"
label var History "History"
label var Mathematics "Mathematics"
label var Physics "Physics"
label var Portuguese "Portuguese"
label var Language "Foreign Language"

** Female x priority interaction
generate fem_priority = female * priority
label variable fem_priority "Female $\times$ Priority"

** Subject dummies interacted with female, priority, and female x priority
* Expand the contiguous block of subject dummies once, then loop over the names
unab subj_dummies : Biology-Portuguese
foreach s of local subj_dummies {
    generate fem_`s' = `s' * female
    label variable fem_`s' "Female $\times$ `s'"

    generate prio_`s' = priority * `s'
    label variable prio_`s' "Priority $\times$ `s'"

    generate fem_prio_`s' = fem_priority * `s'
    label variable fem_prio_`s' "Female $\times$ Priority $\times$ `s'"
}

** Regressor lists used later: subject dummies and their female interactions
* Biology is not listed in either macro
global subject "Chemistry Geography History Mathematics Physics"
global subject_fem "fem_Chemistry fem_Geography fem_History fem_Mathematics fem_Physics"

** Powers 2 to 4 of the normalized, subject-specific Phase 1 score
foreach p of numlist 2/4 {
    generate norm_p1score`p' = norm_p1score^`p'
    summarize norm_p1score`p'
}

*********************************************************************************
****************   Relative performances ****************************************
*********************************************************************************

******************************** ENEM ********************************
* ENEM score measured as the deviation from the mean of the applicant's own
* year-by-gender cell
bysort year female: egen norm_enem_w_ave_g = mean(norm_enem_w)
generate norm_enem_w_g = norm_enem_w - norm_enem_w_ave_g
bysort year female: summarize norm_enem_w_g
drop norm_enem_w_ave_g
label variable norm_enem_w_g "ENEM"

* Subject x ENEM and female x subject x ENEM, each with powers 2 to 4
unab subj_dummies : Biology-Portuguese
foreach s of local subj_dummies {
    generate enem_`s' = `s' * norm_enem_w_g
    label variable enem_`s' "ENEM $\times$ `s'"

    generate fem_enem_`s' = female * norm_enem_w_g * `s'
    label variable fem_enem_`s' "Female $\times$ ENEM $\times$ `s'"

    * The powers are taken of the interaction itself
    forvalues p = 2/4 {
        generate enem_`s'_`p' = enem_`s'^`p'
        generate fem_enem_`s'_`p' = fem_enem_`s'^`p'
    }

    summarize enem_`s'* fem_enem_`s'*
}

* Subject-specific ENEM polynomials; the wildcards pick up the level and its powers
global g_pol_enem_sub "enem_Chemistry* enem_Geography* enem_History* enem_Mathematics* enem_Physics*"
describe $g_pol_enem_sub

* Priority x relative ENEM performance: level plus powers 2 to 4
generate norm_enem_w_priority_g = norm_enem_w_g * priority
forvalues p = 2/4 {
    generate norm_enem_w_priority_g`p' = norm_enem_w_g^`p' * priority
    summarize norm_enem_w_priority_g`p'
}

global g_norm_enem_w_prio norm_enem_w_priority_g*
describe $g_norm_enem_w_prio

******************************** Phase 1 scores ********************************

* Phase 1 score measured as the deviation from the mean of the applicant's own
* year-by-subject-by-gender cell
bysort year subject female: egen gs_norm_p1score_ave = mean(norm_p1score)
generate gs_norm_p1score = norm_p1score - gs_norm_p1score_ave
bysort year female subject: summarize gs_norm_p1score
drop gs_norm_p1score_ave

* Powers 2 to 4 of the relative score
forvalues p = 2/4 {
    generate gs_norm_p1score`p' = gs_norm_p1score^`p'
    summarize gs_norm_p1score`p'
}

* Quartic polynomial in the relative Phase 1 score
global gs_pol_norm_p1score gs_norm_p1score gs_norm_p1score2 gs_norm_p1score3 gs_norm_p1score4
describe $gs_pol_norm_p1score

* Priority x relative Phase 1 score: level plus powers 2 to 4
generate gs_norm_p1score_prio = gs_norm_p1score * priority
forvalues p = 2/4 {
    generate gs_norm_p1score_prio`p' = gs_norm_p1score`p' * priority
    * Summarizes every priority interaction created so far on each pass
    summarize gs_norm_p1score_prio*
}

global gs_pol_norm_p1score_prio gs_norm_p1score_prio*
describe $gs_pol_norm_p1score_prio

*********************************************************************************
**************** Main sample ****************************************************
*********************************************************************************

* 1) Pre-affirmative-action years only: remove observations flagged aa_year = 1,
*    then remove the year 2000
drop if aa_year == 1
tabulate year
drop if year == 2000
tabulate year

* 2) Remove Portuguese and Foreign Language: Phase 1 has no exam in either
*    subject (for Portuguese, Phase 1 has an essay instead)
tabulate subject, summarize(norm_p1score)
drop if inlist(subject, "lang", "port")
tabulate subject, summarize(norm_p1score)
* Their dummies and priority interactions are no longer needed
drop Language Portuguese prio_Language prio_Portuguese fem_prio_Language fem_prio_Portuguese

*********************************************************************************
****************   Dependent variables ******************************************
*********************************************************************************

* Shorter names for the three outcomes; these names are also used for the
* stored estimates
rename totalzeros zero_scores
rename totalmissing omissions
rename total_zeros_missing omissions_zeroscores

* Omissions (questions the student did not answer) are only observed for
* 2001 and 2002, so every related outcome is restricted to those years.
* For 2001-2002 the count of zeros excludes questions scored zero because the
* student left them blank, i.e. total zeros = number of questions that were
* answered and graded zero.
foreach outcome of varlist zero_scores omissions omissions_zeroscores {
    tabulate year, summarize(`outcome')
    * Blank out the outcome for years after 2002
    replace `outcome' = . if year > 2002
    tabulate year, summarize(`outcome')
}


*********************************************************************************
****************   Regressions **************************************************
*********************************************************************************

estimates clear

* Right-hand side shared by all three outcome regressions
local rhs priority fem_priority $subject $subject_fem $g_pol_enem_sub $g_norm_enem_w_prio $gs_pol_norm_p1score $gs_pol_norm_p1score_prio

* Indicator rows reported as "Yes" at the bottom of the table
local yes_rows sub_fe subgender_fe ind_fe enemsub enemprio_pol4 p1score_pol4 p1scoreprio_pol4

foreach depvar of varlist omissions zero_scores omissions_zeroscores {

    * inscri2 is both absorbed as a fixed effect and used as the cluster variable
    reghdfe `depvar' `rhs', cluster(inscri2) absorb(inscri2)
    estimates store `depvar'

    * Summary statistics of the dependent variable for the table footer
    estadd ysumm

    foreach row of local yes_rows {
        estadd local `row' "Yes": `depvar'
    }

}

* Write the three stored models (omissions, zero_scores, omissions_zeroscores)
* to Output/table_omissions_zeros.tex as a booktabs table fragment.
* - Only the coefficients on priority and fem_priority are shown, with standard
*   errors and stars at the 10, 5 and 1 percent levels.
* - The footer lists ymean and ysd (presumably supplied by estadd ysumm - confirm),
*   adjusted R-squared, N, the number of clusters, and the "Yes" rows added with
*   estadd local in the regression loop.
* - NOTE(review): sep and line are not set anywhere in this script; they look
*   like placeholders that yield blank spacer rows - confirm.
* - mgroups puts one centred heading over each of the three columns.
esttab omissions zero_scores  omissions_zeroscores using "Output/table_omissions_zeros.tex", se star(* 0.10 ** 0.05 *** 0.01) replace nogap  f label collabels(none) nomtitle ///
stats(ymean ysd sep r2_a N N_clust line sub_fe subgender_fe ind_fe enemsub enemprio_pol4 p1score_pol4 p1scoreprio_pol4, fmt(%3.2fc %3.2fc %1s %9.3fc %9.0fc %9.0fc %1s %3s %3s %3s %3s %3s %3s %3s) labels("Mean dependent variable" "Std.dev dependent variable" " " "$\bar{R}^2$" "Number of observations" "Number of applicants" " " "Subject FE" "Subject-gender FE" ///
 "Individual FE" "ENEM $\times$ Subject FE" "ENEM $\times$ Priority" "Phase 1 scores" "Phase 1 scores $\times$ Priority")) b(%7.3f) se(%7.3f)  keep(priority fem_priority) /// 
booktabs mgroups("Omissions" "Zero scores"  "Zero + Omissions", pattern(1 1 1) prefix(\multicolumn{@span}{c}{) suffix(}) span erepeat(\cmidrule(lr){@span})) 
